package com.chajia.pageprocess;

import com.chajia.dao.ProductDao;
import com.chajia.dao.ShopDao;
import com.chajia.model.Shop;
import com.chajia.util.LogUtil;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.math.BigInteger;
import java.util.List;
import java.util.regex.*;

/**
 * Created with IntelliJ IDEA.
 * User: Administrator
 * Date: 14-1-19
 * Time: 上午9:47
 * 按天猫的店铺维度
 */
public class TmallProcess implements PageProcessor {
    public static Class clazz= TmallProcess.class;
    private Site site = Site.me().setDomain("www.tmall.com").setSleepTime(500);

    @Override
    public void process(Page page) {
        List<String> links = page.getHtml().links().regex("^http://\\w+\\.tmall\\.com/.*").all();
        //add urls to fetch
        page.addTargetRequests(links);

        Selectable url =page.getUrl();
        //截取一级子域名，为统计店铺做准备
        String sUrl = url.regex("^http://\\w+.tmall.com").toString() ;
        try {
            if (ShopDao.isExist(sUrl)){
                page.setSkip(true);
            }
        } catch (Exception e) {
            LogUtil.error(clazz, e.getMessage());
        }

        if((page.getHtml().xpath("//span[@class='slogo']").toString()!=null)){
            page.putField("sUrl",page.getHtml().xpath("//span[@class='slogo']/a/@href").regex("^http://\\w+.tmall.com").toString());
            if(page.getHtml().xpath("//span[@class='slogo']/a/text()").toString().equals("")) {
                page.setSkip(true);
            }else{
                page.putField("shopName",page.getHtml().xpath("//span[@class='slogo']/a/text()").toString());
            }
            }
        if(page.getHtml().xpath("//span[@class='tbsHeader-slogo']").toString()!=null ){
        page.putField("sUrl",page.getHtml().xpath("//span[@class='tbsHeader-slogo']/a/@href").regex("^http://\\w+.tmall.com").toString());
        if(page.getHtml().xpath("//span[@class='tbsHeader-slogo']/a/text()").toString().equals("")) {
            page.setSkip(true);
        }else{
            page.putField("shopName",page.getHtml().xpath("//span[@class='tbsHeader-slogo']/a/text()").toString());
        }
        }
    }

    @Override
    public Site getSite() {
        return site;
    }
}
